{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://storage.depia.wiki/twitter-dump/2011-10.jsonl.gz\n",
    "# !wget https://storage.depia.wiki/twitter-dump/2011-11.jsonl.gz\n",
    "# !gzip -d 2011-10.jsonl.gz\n",
    "# !gzip -d 2011-11.jsonl.gz\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/dumping-twitter-6-july-2019.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-02-22-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-08-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-28-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-12-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-22-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-02-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-11-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-31-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-03-06-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-04-21-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-06-06-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2022-06-08-twitter.tar\n",
    "\n",
    "# !tar -xf compiled-2021-03-06-twitter.tar\n",
    "# !tar -xf compiled-2021-04-21-twitter.tar\n",
    "# !tar -xf compiled-2021-06-06-twitter.tar\n",
    "# !tar -xf compiled-2022-06-08-twitter.tar\n",
    "# !rm *.tar\n",
    "\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-02-22-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-08-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-28-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-12-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-22-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-02-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-11-twitter-dump-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-31-twitter-dump-en.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def language_detection_textcleaning(string):\n",
    "    \"\"\"\n",
    "    Normalise raw text (tweets, in this notebook) into lowercase word tokens.\n",
    "\n",
    "    Steps, in order:\n",
    "      1. drop every whitespace-separated token containing '#' or '@';\n",
    "      2. remove URL-like spans that start with 'http' or 'www.';\n",
    "      3. pad a fixed set of punctuation marks with spaces;\n",
    "      4. replace digits and the listed punctuation with a space;\n",
    "      5. collapse runs of spaces, strip the ends and lowercase.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "        Text to clean.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Cleaned, lowercased text; may be empty.\n",
    "    \"\"\"\n",
    "    # split() + join() already turns every newline and tab into a single\n",
    "    # space, so no separate newline/tab removal is needed further down.\n",
    "    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]\n",
    "\n",
    "    # The dot after 'www' is escaped so that elongated words such as\n",
    "    # 'awwwww' are not mistaken for URLs and partially deleted.\n",
    "    string = re.sub(r'http\\S+|www\\.\\S+', '', ' '.join(kept))\n",
    "\n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "\n",
    "    # Raw string: the backslashes are regex escapes, not Python string escapes.\n",
    "    string = re.sub(\n",
    "        r'[0-9!@#$%^&*()_\\-+{}|\\~`\\'\";:?/.>,<]', ' ', string, flags=re.UNICODE\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "\n",
    "    return string.lower()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/husein/dev/malaya/pretrained-model/prepare-llm/dumping-twitter-6-july-2019.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-04-12-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-04-22-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-05-11-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-05-02-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-05-31-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-03-28-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-02-22-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/2020-03-08-twitter-dump-in.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-15.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-1.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-33.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-31.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-40.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-20.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-4.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-26.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-8.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-11.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-16.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-10.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-32.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-29.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-22.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-30.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-17.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-19.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-38.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-41.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-24.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-27.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-0.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-12.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-25.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-14.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-3.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-18.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-34.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-21.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-9.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-2.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-7.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-35.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-28.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-39.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-37.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-36.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-23.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-13.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-5.json',\n",
       " '/home/husein/dev/malaya/pretrained-model/prepare-llm/compiled/2021-dump-bahasa-twitter-6.json']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "# Single place to edit when the dumps live in another directory.\n",
    "PREPARE_LLM_DIR = '/home/husein/dev/malaya/pretrained-model/prepare-llm'\n",
    "\n",
    "# Input files: the 2019 dump, the '*twitter-dump-in.json' dumps and every\n",
    "# JSON file under compiled/. glob() returns matches in arbitrary order, so\n",
    "# each group is sorted to keep the processing order reproducible.\n",
    "bahasa = [f'{PREPARE_LLM_DIR}/dumping-twitter-6-july-2019.json']\n",
    "bahasa.extend(sorted(glob(f'{PREPARE_LLM_DIR}/*twitter-dump-in.json')))\n",
    "bahasa.extend(sorted(glob(f'{PREPARE_LLM_DIR}/compiled/*.json')))\n",
    "bahasa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2016"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Vocabulary used by the filtering cells: a text is routed to the 'id'\n",
    "# output when it shares at least one token with this set.\n",
    "# Explicit UTF-8 so decoding does not depend on the machine's locale.\n",
    "with open('indon-words-socialmedia.json', encoding='utf-8') as fopen:\n",
    "    indon_words_socialmedia = set(json.load(fopen))\n",
    "\n",
    "len(indon_words_socialmedia)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output sinks for the filtering passes. Both handles stay open across\n",
    "# cells and are closed explicitly in a later cell.\n",
    "id_open, other_open = [\n",
    "    open(name, 'w')\n",
    "    for name in ('filter-twitter-id.jsonl', 'filter-twitter-other.jsonl')\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "500000it [00:23, 21182.66it/s]\n",
      "500000it [00:24, 20158.75it/s]\n",
      "500000it [00:24, 20793.39it/s]\n",
      "500000it [00:25, 19869.26it/s]\n",
      "500000it [00:25, 19904.92it/s]\n",
      "75025it [00:03, 20441.79it/s]\n",
      "500000it [00:24, 20041.20it/s]\n",
      "500000it [00:24, 20209.79it/s]\n",
      "500000it [00:24, 20304.93it/s]\n",
      "500000it [00:24, 20426.33it/s]\n",
      "500000it [00:24, 20584.19it/s]\n",
      "500000it [00:24, 20249.99it/s]\n",
      "500000it [00:24, 20277.54it/s]\n",
      "500000it [00:24, 20606.06it/s]\n",
      "500000it [00:24, 20600.47it/s]\n"
     ]
    }
   ],
   "source": [
    "# Pass over the line-delimited dumps: keep only records whose 'lang' is\n",
    "# 'in', clean the text, then write it to id_open when it shares a token with\n",
    "# indon_words_socialmedia and to other_open otherwise.\n",
    "# sorted(): glob() order is arbitrary; sorting keeps the output order stable.\n",
    "for path in sorted(glob('/home/husein/ssd3/twitter/*.splitted')):\n",
    "    # Explicit UTF-8 so decoding does not depend on the machine's locale.\n",
    "    with open(path, encoding='utf-8') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            if data['lang'] != 'in':\n",
    "                continue\n",
    "\n",
    "            t = language_detection_textcleaning(data['data_text'])\n",
    "            # isdisjoint() answers 'any shared token?' without building the\n",
    "            # intersection set.\n",
    "            if set(t.split()).isdisjoint(indon_words_socialmedia):\n",
    "                sink = other_open\n",
    "            else:\n",
    "                sink = id_open\n",
    "\n",
    "            sink.write(f'{json.dumps(t)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2707442 filter-twitter-other.jsonl\n",
      "739338 filter-twitter-id.jsonl\n"
     ]
    }
   ],
   "source": [
    "# Both output files are still open here, so push buffered lines to disk\n",
    "# first; otherwise wc can under-count what has been written so far.\n",
    "id_open.flush()\n",
    "other_open.flush()\n",
    "!wc -l filter-twitter-other.jsonl\n",
    "!wc -l filter-twitter-id.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████| 6597867/6597867 [00:59<00:00, 110622.04it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1673772/1673772 [00:17<00:00, 95189.82it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1121928/1121928 [00:11<00:00, 94375.21it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1134736/1134736 [00:11<00:00, 96324.20it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1175497/1175497 [00:12<00:00, 94215.32it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 2131906/2131906 [00:21<00:00, 96982.65it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1250595/1250595 [00:13<00:00, 90897.76it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1617795/1617795 [00:17<00:00, 90428.12it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 1711555/1711555 [00:18<00:00, 90202.79it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84448.41it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 82105.20it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 81033.43it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 79940.70it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84547.19it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 83702.74it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 87227.00it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 80759.83it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 87108.32it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 81876.23it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 88724.34it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 89650.79it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 82275.25it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 82879.64it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 80101.86it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86649.31it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 87001.68it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86150.69it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 79148.43it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86100.22it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 80802.29it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 82852.81it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 85050.30it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 85040.57it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 78241.15it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84989.79it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86967.03it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 85945.52it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84045.13it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84929.46it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84059.85it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 87301.51it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 92590.41it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 83412.36it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86397.29it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84095.31it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 82278.46it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 84513.07it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 81365.18it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 86851.57it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 91626.50it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 88659.81it/s]\n"
     ]
    }
   ],
   "source": [
    "# Pass over the JSON-array dumps listed in `bahasa`. An element that is a\n",
    "# dict contributes its 'data_text' value; any other element is used as the\n",
    "# text directly. Routing matches the previous pass: id_open on a token hit in\n",
    "# indon_words_socialmedia, other_open otherwise.\n",
    "for path in bahasa:\n",
    "    # Explicit UTF-8 so decoding does not depend on the machine's locale.\n",
    "    with open(path, encoding='utf-8') as fopen:\n",
    "        data = json.load(fopen)\n",
    "\n",
    "    for d in tqdm(data):\n",
    "        t = d['data_text'] if isinstance(d, dict) else d\n",
    "\n",
    "        t = language_detection_textcleaning(t)\n",
    "        # isdisjoint() answers 'any shared token?' without building the\n",
    "        # intersection set.\n",
    "        if set(t.split()).isdisjoint(indon_words_socialmedia):\n",
    "            sink = other_open\n",
    "        else:\n",
    "            sink = id_open\n",
    "\n",
    "        sink.write(f'{json.dumps(t)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close both outputs; closing also flushes any lines still buffered.\n",
    "for handle in (id_open, other_open):\n",
    "    handle.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20662653 filter-twitter-other.jsonl\n",
      "5399868 filter-twitter-id.jsonl\n"
     ]
    }
   ],
   "source": [
    "# Final line counts, taken after both output files have been closed.\n",
    "!wc -l filter-twitter-other.jsonl\n",
    "!wc -l filter-twitter-id.jsonl"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
